In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import PolynomialFeatures
In [2]:
# Function to generate target value for a given x.
true_func = lambda X: np.cos(1.5 * np.pi * X)
In [3]:
np.random.seed(0)
# Training Set: No. of random samples used for training the model
n_samples = 30
x = np.sort(np.random.rand(n_samples))
y = true_func(x) + np.random.randn(n_samples) * 0.1
# Test Set: 100 evenly spaced samples for which we want the model to predict the target
n_test = 100
x_test = np.linspace(0, 1, n_test)
y_test_actual = true_func(x_test) + np.random.randn(n_test) * 0.1
In [4]:
x[:5]
Out[4]:
In [5]:
x[:5],y[:5]
Out[5]:
In [6]:
# Function to add more features from existing features
# in this case degree is the desired order of polynomials
# Example degree 3 with 1 feature x would output: x,x^2,x^3
# Similarly, with two features x1, x2 and degree 3 the output would be: x1, x2, x1^2, x1*x2, x2^2, x1^3, x1^2*x2, x1*x2^2, x2^3
def generate_higher_order(degrees, x):
    # Generate higher order features from a given set of features.
    poly = PolynomialFeatures(degree = degrees,
                              include_bias = False)
    x_new = poly.fit_transform(x)
    df = pd.DataFrame(x_new)
    # Name the columns x0, x1, x2, ... in the order PolynomialFeatures emits them.
    df.columns = df.columns.map(lambda n: 'x' + str(n))
    return df
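As a quick illustration of what this expansion produces (a hypothetical check cell, not part of the original pipeline), a degree-3 expansion of a single feature yields columns x0, x1, x2 corresponding to x, x^2, x^3:
# Degree-3 expansion of a single feature: columns are x, x^2, x^3.
demo = generate_higher_order(3, np.array([[0.5], [2.0]]))
print(demo)
#     x0    x1     x2
# 0  0.5  0.25  0.125
# 1  2.0  4.00  8.000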
In [7]:
data_path = r'..\Data\RegressionExamples\under_over_fit_30samples'
In [8]:
# degrees for feature generation
degrees = [1, 4, 15]
In [9]:
# Generate training set for each of the degree
for d in degrees:
    df = generate_higher_order(d, x.reshape((n_samples, 1)))
    df['y'] = y
    df.to_csv(os.path.join(data_path, 'fit_degree_{0}_example_train{1}.csv'.format(d, n_samples)),
              index = True,
              index_label = 'Row')
In [10]:
# Generate Evaluation set. Contains all the features and target.
# Generate Test set. Contains only the features. AWSML would predict the target
for d in degrees:
    df = generate_higher_order(d, x_test.reshape((n_test, 1)))
    df.to_csv(os.path.join(data_path, 'fit_degree_{0}_example_test{1}.csv'.format(d, n_samples)),
              index = True,
              index_label = 'Row')
    df['y'] = y_test_actual
    df.to_csv(os.path.join(data_path, 'fit_degree_{0}_example_eval{1}.csv'.format(d, n_samples)),
              index = True,
              index_label = 'Row')
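The predictions used below were produced by AWSML batch predictions on these files. As a local sanity check, the same fits can be reproduced with scikit-learn's LinearRegression (a minimal sketch under the assumption of plain least squares; Amazon ML trains its linear models with SGD and optional regularization, so the numbers will not match exactly):
from sklearn.linear_model import LinearRegression

# Fit one ordinary least squares model per polynomial degree.
local_models = {}
for d in degrees:
    X_train = generate_higher_order(d, x.reshape((n_samples, 1)))
    local_models[d] = LinearRegression().fit(X_train, y)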
In [11]:
# Pull predictions generated by the AWSML batch-prediction job.
df_samples = pd.read_csv(os.path.join(data_path, 'fit_degree_1_example_train30.csv'),
                         index_col = 'Row')
df_actual = pd.read_csv(os.path.join(data_path, 'fit_degree_1_example_eval30.csv'),
                        index_col = 'Row')
# pandas infers gzip compression from the .csv.gz extension.
df_d1_predicted = pd.read_csv(
    os.path.join(data_path, 'output_deg_1',
                 'bp-aYBztCIPIdb-fit_degree_1_example_test30.csv.gz'))
df_d1_predicted.columns = ["Row", "y_predicted"]
In [12]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df_samples['x0'],
            y = df_samples['y'],
            color = 'b',
            label = 'samples')
plt.scatter(x = df_actual['x0'],
            y = df_actual['y'],
            color = 'r',
            label = 'true function')
plt.scatter(x = df_actual['x0'],
            y = df_d1_predicted['y_predicted'],
            color = 'g',
            label = 'predicted with degree 1')
plt.title('Model with degree 1 feature - Underfit')
plt.grid(True)
plt.legend()
Out[12]:
A degree-1 polynomial is a straight line, so the model cannot capture the curvature of the true function: it underfits.
Training RMSE: 0.5063, Evaluation RMSE: 0.4308, Baseline RMSE: 0.689
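The RMSE figures quoted here come from the Amazon ML evaluation report. They can be approximated locally as a sanity check (a sketch; the baseline is assumed to be the error of always predicting the mean target, which is how Amazon ML reports its regression baseline):
def rmse(actual, predicted):
    return np.sqrt(np.mean((np.asarray(actual) - np.asarray(predicted)) ** 2))

# Baseline: always predict the mean of the training target.
print('Baseline RMSE:', rmse(df_actual['y'], np.full(n_test, y.mean())))
print('Degree-1 Evaluation RMSE:', rmse(df_actual['y'], df_d1_predicted['y_predicted']))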
In [13]:
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df_actual['y'],
             df_d1_predicted['y_predicted']],
            labels = ['actual', 'predicted with deg1'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('Target Attribute')
plt.grid(True)
In [14]:
df_d4_predicted = pd.read_csv(
    os.path.join(data_path, 'output_deg_4',
                 'bp-W4oBOhwClbH-fit_degree_4_example_test30.csv.gz'))
df_d4_predicted.columns = ["Row", "y_predicted"]
In [15]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df_samples['x0'],
            y = df_samples['y'],
            color = 'b',
            label = 'samples')
plt.scatter(x = df_actual['x0'],
            y = df_actual['y'],
            color = 'r',
            label = 'true function')
plt.scatter(x = df_actual['x0'],
            y = df_d4_predicted['y_predicted'],
            color = 'g',
            label = 'predicted with degree 4')
plt.title('Model with degree 4 features - normal fit')
plt.grid(True)
plt.legend()
Out[15]:
Good fit with a degree-4 polynomial.
Training RMSE: 0.2563, Evaluation RMSE: 0.1493, Baseline RMSE: 0.689
In [16]:
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df_actual['y'],
             df_d1_predicted['y_predicted'],
             df_d4_predicted['y_predicted']],
            labels = ['actual', 'predicted with deg1', 'predicted with deg4'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('Target Attribute')
plt.grid(True)
In [17]:
df_d15_predicted = pd.read_csv(
    os.path.join(data_path, 'output_deg_15',
                 'bp-rBWxcnPN3zu-fit_degree_15_example_test30.csv.gz'))
df_d15_predicted.columns = ["Row", "y_predicted"]
In [18]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df_samples['x0'],
            y = df_samples['y'],
            color = 'b',
            label = 'samples')
plt.scatter(x = df_actual['x0'],
            y = df_actual['y'],
            color = 'r',
            label = 'true function')
plt.scatter(x = df_actual['x0'],
            y = df_d15_predicted['y_predicted'],
            color = 'g',
            label = 'predicted with degree 15')
plt.title('Model with degree 15 features')
plt.grid(True)
plt.legend()
Out[18]:
Unlike the scikit-learn overfitting example, degree 15 does not overfit badly here; the fit is actually quite good.
Training RMSE: 0.2984, Evaluation RMSE: 0.1222, Baseline RMSE: 0.689
In [19]:
fig = plt.figure(figsize = (12, 8))
plt.boxplot([df_actual['y'],
             df_d1_predicted['y_predicted'],
             df_d4_predicted['y_predicted'],
             df_d15_predicted['y_predicted']],
            labels = ['actual', 'predicted deg1', 'predicted deg4', 'predicted deg15'])
plt.title('Box Plot - Actual, Predicted')
plt.ylabel('Target Attribute')
plt.grid(True)
To add polynomial features that combine all input features, use scikit-learn's PolynomialFeatures; Anaconda includes scikit-learn by default.
We saw good performance with degree 4. Additional higher-order features may bring incremental improvement, but at the cost of the added complexity of generating and managing them.
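For instance, with two input features PolynomialFeatures generates all cross terms up to the requested degree (a small illustration; get_feature_names_out requires scikit-learn 1.0 or newer):
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

X = np.array([[2.0, 3.0]])  # two features: x1, x2
poly = PolynomialFeatures(degree = 2, include_bias = False)
print(poly.fit_transform(X))
# [[2. 3. 4. 6. 9.]]  -> x1, x2, x1^2, x1*x2, x2^2
print(poly.get_feature_names_out(['x1', 'x2']))
# ['x1' 'x2' 'x1^2' 'x1 x2' 'x2^2']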